Pandas - For Data Analysis. Uses Structures and tables and time series Almost always called in with <lower case "pd">¶
Numpy - bade of scientific computing called in with lower case np¶
Matplotlib.pyplot - allows for static and animated vizualization. called in with lower case "plt".¶
Note: in order to have graphs properly show in jupyter lavs, we use %matplotlib inline command¶
seaborn - based on matplotli. provides attractive graphics. called "sns"¶
Data Structures¶
In [ ]:
In [68]:
## Import Packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import sklearn
import pip
import sys
!{sys.executable} -m pip install xgboost
Requirement already satisfied: xgboost in c:\users\user\anaconda3\lib\site-packages (3.0.5) Requirement already satisfied: numpy in c:\users\user\anaconda3\lib\site-packages (from xgboost) (2.1.3) Requirement already satisfied: scipy in c:\users\user\anaconda3\lib\site-packages (from xgboost) (1.15.3)
In [69]:
# Create Timestamp
from datetime import datetime
time_now = datetime.now().strftime('%m_%d_%Y_%I%M%p')
print("Current Time: {}".format(time_now))
Current Time: 09_15_2025_0911AM
In [70]:
#import warning filter - otherwise you'll get lots of warnings that dont affect the outcome
from warnings import simplefilter
#ignore all future warnings
simplefilter(action='ignore',category=FutureWarning)
In [71]:
df=pd.read_csv('diabetes.csv')
In [72]:
df.head()
Out[72]:
| Pregnancies | Glucose | BloodPressure | SkinThickness | Insulin | BMI | DiabetesPedigreeFunction | Age | Outcome | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 6 | 148 | 72 | 35 | 0 | 33.6 | 0.627 | 50 | 1 |
| 1 | 1 | 85 | 66 | 29 | 0 | 26.6 | 0.351 | 31 | 0 |
| 2 | 8 | 183 | 64 | 0 | 0 | 23.3 | 0.672 | 32 | 1 |
| 3 | 1 | 89 | 66 | 23 | 94 | 28.1 | 0.167 | 21 | 0 |
| 4 | 0 | 137 | 40 | 35 | 168 | 43.1 | 2.288 | 33 | 1 |
In [73]:
df.describe()
Out[73]:
| Pregnancies | Glucose | BloodPressure | SkinThickness | Insulin | BMI | DiabetesPedigreeFunction | Age | Outcome | |
|---|---|---|---|---|---|---|---|---|---|
| count | 768.000000 | 768.000000 | 768.000000 | 768.000000 | 768.000000 | 768.000000 | 768.000000 | 768.000000 | 768.000000 |
| mean | 3.845052 | 120.894531 | 69.105469 | 20.536458 | 79.799479 | 31.992578 | 0.471876 | 33.240885 | 0.348958 |
| std | 3.369578 | 31.972618 | 19.355807 | 15.952218 | 115.244002 | 7.884160 | 0.331329 | 11.760232 | 0.476951 |
| min | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.078000 | 21.000000 | 0.000000 |
| 25% | 1.000000 | 99.000000 | 62.000000 | 0.000000 | 0.000000 | 27.300000 | 0.243750 | 24.000000 | 0.000000 |
| 50% | 3.000000 | 117.000000 | 72.000000 | 23.000000 | 30.500000 | 32.000000 | 0.372500 | 29.000000 | 0.000000 |
| 75% | 6.000000 | 140.250000 | 80.000000 | 32.000000 | 127.250000 | 36.600000 | 0.626250 | 41.000000 | 1.000000 |
| max | 17.000000 | 199.000000 | 122.000000 | 99.000000 | 846.000000 | 67.100000 | 2.420000 | 81.000000 | 1.000000 |
In [74]:
df.columns
Out[74]:
Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],
dtype='object')
In [75]:
df.shape
Out[75]:
(768, 9)
In [76]:
#look at the relationship
sns.pairplot(df,hue="Outcome")
#plt.show()
Out[76]:
<seaborn.axisgrid.PairGrid at 0x1d6e59255b0>
In [77]:
#check for null values
print(df.isnull().values.any())
#if you have rows with missing date, use df=df.dropna() to get rid of rows null values
False
In [78]:
# we need to standardize the variable so that we can accurately use machine learning. our object is to have the dtandard deviations all between 0 and 1
X=df.drop('Outcome',axis=1)
y=df['Outcome']
#standardize variables
X=(X-X.mean())/X.std()
In [79]:
X.head()
Out[79]:
| Pregnancies | Glucose | BloodPressure | SkinThickness | Insulin | BMI | DiabetesPedigreeFunction | Age | |
|---|---|---|---|---|---|---|---|---|
| 0 | 0.639530 | 0.847771 | 0.149543 | 0.906679 | -0.692439 | 0.203880 | 0.468187 | 1.425067 |
| 1 | -0.844335 | -1.122665 | -0.160441 | 0.530556 | -0.692439 | -0.683976 | -0.364823 | -0.190548 |
| 2 | 1.233077 | 1.942458 | -0.263769 | -1.287373 | -0.692439 | -1.102537 | 0.604004 | -0.105515 |
| 3 | -0.844335 | -0.997558 | -0.160441 | 0.154433 | 0.123221 | -0.493721 | -0.920163 | -1.040871 |
| 4 | -1.141108 | 0.503727 | -1.503707 | 0.906679 | 0.765337 | 1.408828 | 5.481337 | -0.020483 |
In [80]:
X.describe()
Out[80]:
| Pregnancies | Glucose | BloodPressure | SkinThickness | Insulin | BMI | DiabetesPedigreeFunction | Age | |
|---|---|---|---|---|---|---|---|---|
| count | 7.680000e+02 | 7.680000e+02 | 7.680000e+02 | 7.680000e+02 | 7.680000e+02 | 7.680000e+02 | 7.680000e+02 | 7.680000e+02 |
| mean | -6.938894e-17 | -1.156482e-17 | 1.272131e-17 | 9.367507e-17 | -1.676899e-17 | 2.613650e-16 | 2.359224e-16 | 2.104798e-16 |
| std | 1.000000e+00 | 1.000000e+00 | 1.000000e+00 | 1.000000e+00 | 1.000000e+00 | 1.000000e+00 | 1.000000e+00 | 1.000000e+00 |
| min | -1.141108e+00 | -3.781190e+00 | -3.570271e+00 | -1.287373e+00 | -6.924393e-01 | -4.057829e+00 | -1.188778e+00 | -1.040871e+00 |
| 25% | -8.443348e-01 | -6.847901e-01 | -3.670975e-01 | -1.287373e+00 | -6.924393e-01 | -5.951906e-01 | -6.885198e-01 | -7.857741e-01 |
| 50% | -2.507887e-01 | -1.218083e-01 | 1.495433e-01 | 1.544326e-01 | -4.277835e-01 | 9.413653e-04 | -2.999328e-01 | -3.606124e-01 |
| 75% | 6.395305e-01 | 6.053764e-01 | 5.628560e-01 | 7.186174e-01 | 4.117396e-01 | 5.843897e-01 | 4.659233e-01 | 6.597757e-01 |
| max | 3.904034e+00 | 2.442886e+00 | 2.732747e+00 | 4.918660e+00 | 6.648507e+00 | 4.452906e+00 | 5.879733e+00 | 4.061069e+00 |
In [81]:
# Split data into 75% training and 25% learning
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)
Assistant¶
The error occurs because the variables X and y are not defined before being used in the train_test_split function. You need to define these variables before splitting the data.
Would you like me to provide the corrected code?
In [82]:
#logistical regression
from sklearn.linear_model import LogisticRegression
lr=LogisticRegression()
lr.fit(X_train,y_train)
Out[82]:
LogisticRegression()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LogisticRegression()
In [92]:
#LR predictions
lr_preds=lr.predict(X_test)
lr_preds
Out[92]:
array([0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1,
0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1,
0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0,
0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0,
1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1,
0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1])
In [84]:
# use Decision Tree Model
from sklearn.tree import DecisionTreeClassifier
dt= DecisionTreeClassifier()
dt.fit(X_train,y_train)
Out[84]:
DecisionTreeClassifier()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
DecisionTreeClassifier()
In [85]:
#dt predictions
dt_preds=dt.predict(X_test)
dt_preds
Out[85]:
array([1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0,
0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1,
0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1,
0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1,
0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1,
1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0,
0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0,
1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0,
0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0])
In [86]:
#random Forest Classifier
from sklearn.ensemble import RandomForestClassifier
rf=RandomForestClassifier()
rf.fit(X_train,y_train)
Out[86]:
RandomForestClassifier()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
RandomForestClassifier()
In [87]:
#rf Predicitons
rf_preds=rf.predict(X_test)
rf_preds
Out[87]:
array([0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1,
0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1,
0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0,
0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0,
0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0,
1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0,
0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1])
In [88]:
#XG boost
from xgboost import XGBClassifier
xgb=XGBClassifier()
xgb.fit(X_train,y_train)
Out[88]:
XGBClassifier(base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=None, device=None, early_stopping_rounds=None,
enable_categorical=False, eval_metric=None, feature_types=None,
feature_weights=None, gamma=None, grow_policy=None,
importance_type=None, interaction_constraints=None,
learning_rate=None, max_bin=None, max_cat_threshold=None,
max_cat_to_onehot=None, max_delta_step=None, max_depth=None,
max_leaves=None, min_child_weight=None, missing=nan,
monotone_constraints=None, multi_strategy=None, n_estimators=None,
n_jobs=None, num_parallel_tree=None, ...)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
XGBClassifier(base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=None, device=None, early_stopping_rounds=None,
enable_categorical=False, eval_metric=None, feature_types=None,
feature_weights=None, gamma=None, grow_policy=None,
importance_type=None, interaction_constraints=None,
learning_rate=None, max_bin=None, max_cat_threshold=None,
max_cat_to_onehot=None, max_delta_step=None, max_depth=None,
max_leaves=None, min_child_weight=None, missing=nan,
monotone_constraints=None, multi_strategy=None, n_estimators=None,
n_jobs=None, num_parallel_tree=None, ...)In [89]:
#XGB preditions
xgb_preds=xgb.predict(X_test)
xgb_preds
Out[89]:
array([1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0,
0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1,
0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1,
0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0,
0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0,
0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0,
1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1,
1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1])
In [94]:
#evaluate the accuracy of the models
model=np.array(['LogisticRegression','DecisionTree','Random Forest','XGBoost'])
from sklearn.metrics import accuracy_score
scores = np.array([accuracy_score(lr_preds,y_test),accuracy_score(dt_preds,y_test),accuracy_score(rf_preds,y_test),accuracy_score(xgb_preds,y_test)])
In [97]:
df= {'model':model,'scores':scores}
sns.barplot(x='model',y='scores',data=df)
plt.show()
In [102]:
#Lest's assume we have new data set with no outcome that we want to predict.
df1=pd.read_csv('diabetes.csv')
base_data=df1.sample(frac=.25)
base_data=df1.drop(columns=["Outcome"])
base_data.head()
lr_preds=lr.predict(base_data)
dt_preds=dt.predict(base_data)
rf_preds=rf.predict(base_data)
xgb_preds=xgb.predict(base_data)
In [103]:
#add predicted data to diabetes file with outcome column for comparision
df1["Logistic Regression"]=lr_preds
df1["Decision Tree"]=dt_preds
df1["Random Forest"]=rf_preds
df1["XGBoost"]=xgb_preds
df1.head()
Out[103]:
| Pregnancies | Glucose | BloodPressure | SkinThickness | Insulin | BMI | DiabetesPedigreeFunction | Age | Outcome | Logistic Regression | Decision Tree | Random Forest | XGBoost | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 6 | 148 | 72 | 35 | 0 | 33.6 | 0.627 | 50 | 1 | 1 | 1 | 1 | 1 |
| 1 | 1 | 85 | 66 | 29 | 0 | 26.6 | 0.351 | 31 | 0 | 1 | 1 | 1 | 1 |
| 2 | 8 | 183 | 64 | 0 | 0 | 23.3 | 0.672 | 32 | 1 | 1 | 1 | 1 | 1 |
| 3 | 1 | 89 | 66 | 23 | 94 | 28.1 | 0.167 | 21 | 0 | 1 | 1 | 1 | 1 |
| 4 | 0 | 137 | 40 | 35 | 168 | 43.1 | 2.288 | 33 | 1 | 1 | 1 | 1 | 1 |
In [104]:
#export new file to excel
df1.to_excel('Diabetes with 4 ML predictions.xlsx')
In [ ]: